{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from mpl_toolkits import mplot3d\n", "import seaborn as sns\n", "import numpy as np\n", "\n", "import scipy.cluster.hierarchy as shc\n", "\n", "# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;\n", "# the generators are exported from sklearn.datasets directly.\n", "from sklearn.datasets import make_blobs\n", "from sklearn.datasets import make_circles\n", "from sklearn.datasets import make_moons\n", "\n", "from sklearn.cluster import AgglomerativeClustering\n", "from sklearn.cluster import KMeans\n", "\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.metrics import silhouette_score\n", "from sklearn.metrics import silhouette_samples\n", "\n", "from sklearn.decomposition import PCA\n", "\n", "from sklearn import datasets\n", "\n", "%matplotlib inline\n", "pd.set_option(\"display.max_columns\", None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lab 24 - Simulated clusters\n", "\n", "The following code will create 3 clusters in 10-dimensional space (`n_features=10`) using 100 data points. The coordinates of the data points are given in X and which cluster they belong to is given in y." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Fixed seed so the simulated blobs (and everything derived from them) are reproducible on re-run.\n", "X, y = make_blobs(n_samples=100, centers=3, n_features=10, random_state=42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Display X."
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ -5.63865638, 11.14197327, 7.16177736, 7.86528225,\n", " -5.92051105, 1.43115664, -8.73929625, 0.78282794,\n", " -4.55496349, -3.69300444],\n", " [ 1.29014194, 7.93595136, -10.42814878, 4.76282079,\n", " -5.62022996, 3.46953746, -0.30472009, -4.02592815,\n", " -3.69404824, -8.58295791],\n", " [ 9.86294581, -6.12532891, -9.95068937, -4.4877234 ,\n", " 2.25457323, -6.94432921, -2.01939864, 4.40875096,\n", " 8.08598599, 0.60009 ],\n", " [ -7.26600385, 8.94760342, 7.58345195, 6.59360261,\n", " -6.14318843, -2.33604773, -10.79134719, -0.23519408,\n", " -4.3289965 , -3.23619435],\n", " [ -6.20591753, 9.3940471 , 6.84313925, 6.99905781,\n", " -3.74922807, -3.19729309, -8.78228476, 1.08450129,\n", " -5.88903982, -2.29606982],\n", " [ -5.91570796, 10.60464114, 5.46919513, 4.04803957,\n", " -6.82430791, -2.53118788, -8.23074054, -0.59961669,\n", " -6.0298296 , -2.94390571],\n", " [ 8.37867089, -6.20409551, -8.40417244, -4.9109817 ,\n", " 2.75707251, -9.130333 , -1.29632154, 5.53120426,\n", " 9.67394784, -0.18703975],\n", " [ -4.98431218, 10.5679459 , 6.54053495, 5.71304771,\n", " -4.83182407, 0.08025859, -9.46770544, 1.76671603,\n", " -5.53126893, -5.19364105],\n", " [ 4.25735344, 7.898224 , -9.79959917, 3.95898697,\n", " -5.25757979, 1.98962049, -1.42414732, -5.28741557,\n", " -2.66760448, -9.87332364],\n", " [ -7.3651727 , 10.69085091, 6.17814642, 7.57515375,\n", " -4.10679026, -2.83772654, -7.93767406, 2.75539456,\n", " -6.25835215, -1.33789716],\n", " [ 10.16655218, -5.90829939, -9.99695744, -6.19932777,\n", " 3.68676939, -8.02330746, -3.74720058, 4.73729075,\n", " 8.72343685, 0.17077219],\n", " [ 1.54916149, 9.75486969, -7.68423796, 5.9938356 ,\n", " -6.14502269, 1.12072297, -2.41909992, -4.59320946,\n", " -4.17208608, -7.55561386],\n", " [ 9.97989303, -5.46849247, -9.73153854, -4.24258773,\n", " 2.3364311 , -7.95451743, -3.16623239, 4.24797385,\n", " 
9.35424853, -1.47366028],\n", " [ -6.53856075, 10.48550573, 8.20393127, 4.71187262,\n", " -6.64234351, -2.63139753, -10.32673328, 0.99065337,\n", " -6.27971946, -3.32210243],\n", " [ 11.09404503, -4.9682383 , -8.94566247, -3.18526928,\n", " 5.20808422, -8.26965836, -3.40659368, 4.61395709,\n", " 8.65082174, 1.10788947],\n", " [ 2.09431434, 8.41970679, -10.53822477, 4.05428003,\n", " -4.80696425, 0.47371163, -1.98467603, -6.19397131,\n", " -3.28271641, -9.82436284],\n", " [ 12.02455375, -5.40282534, -8.95204075, -3.04438716,\n", " 3.04683713, -8.24539845, -2.19224858, 3.93543223,\n", " 7.13152884, -0.34998517],\n", " [ 1.47900485, 6.6179144 , -10.84208307, 4.5122211 ,\n", " -3.57083753, 3.18063014, -1.71405583, -5.49218412,\n", " -4.23016592, -8.98436064],\n", " [ -5.01168052, 8.51397446, 6.88345944, 5.788825 ,\n", " -5.50135145, -3.33498872, -12.04947685, 0.27893702,\n", " -6.42839265, -2.64195681],\n", " [ 2.71730368, 7.74606284, -10.09645029, 4.32783147,\n", " -4.28907317, 2.32940236, -1.00963213, -5.61651022,\n", " -4.90888637, -7.94481733],\n", " [ 9.36126858, -3.77244865, -8.54831342, -2.09401618,\n", " 2.66556586, -8.90664145, -4.33514159, 5.54958627,\n", " 8.04218273, 0.91844448],\n", " [ 9.19793644, -5.85616758, -9.09110606, -1.28706811,\n", " 1.15191389, -6.34113258, -2.88160696, 5.06681541,\n", " 9.79547627, 0.25465343],\n", " [ 2.53868669, 8.90560249, -8.60912502, 4.04648929,\n", " -6.58588319, 1.0230708 , -2.03512892, -4.90202924,\n", " -5.62290008, -8.83806196],\n", " [ 2.78042035, 8.78668777, -9.14660371, 5.05929348,\n", " -5.34038557, 1.27825456, -3.7120683 , -4.38078916,\n", " -4.88421294, -7.2257453 ],\n", " [ 10.2804298 , -5.7188211 , -9.56164909, -3.88861448,\n", " 3.82508171, -7.16415704, -4.03012389, 3.97526155,\n", " 8.8673903 , 0.09290071],\n", " [ 2.72798656, 8.0990256 , -8.29501931, 3.10434307,\n", " -6.07604573, 2.47887715, -2.59301458, -4.66501597,\n", " -3.91942941, -5.68119889],\n", " [ 1.70132497, 7.47030995, -9.27987446, 
3.15673513,\n", " -4.03373936, 1.86547945, -3.93644396, -4.97207995,\n", " -1.12621966, -8.26951634],\n", " [ 8.37113909, -4.03623283, -8.59709749, -2.64927194,\n", " 2.69001427, -7.95439155, -3.23340502, 3.82780972,\n", " 8.87726149, 1.98572851],\n", " [ 7.51444311, -7.77898805, -9.75957623, -4.16471399,\n", " 3.98224455, -9.57730398, -3.36997348, 4.21377525,\n", " 9.75936163, 1.60995023],\n", " [ 1.00843421, 8.91260364, -9.95707889, 5.43021714,\n", " -5.15852567, 2.83173744, -3.20558847, -4.77697942,\n", " -3.30223787, -9.14712338],\n", " [ 0.81439874, 9.28068972, -8.32080612, 5.18986974,\n", " -4.63765129, 2.18293639, -2.27352666, -6.87249994,\n", " -5.87388383, -7.17174877],\n", " [ 1.41118954, 9.31231351, -8.70335677, 3.08945581,\n", " -5.0610114 , 2.05201176, -1.45644113, -5.49613143,\n", " -3.23177375, -9.03239195],\n", " [ 3.82162125, 7.06084463, -9.97598433, 4.74969678,\n", " -4.64502172, 0.57789786, -2.37853964, -5.60871417,\n", " -2.43258528, -6.70381283],\n", " [ -6.04961067, 9.00930329, 7.05710111, 4.30285236,\n", " -4.55567247, -1.14258082, -9.10698423, 1.70057331,\n", " -4.05785218, -3.47294729],\n", " [ 9.06728442, -4.00416241, -9.99447282, -2.63660295,\n", " 1.31080728, -9.82139838, -3.97451862, 4.47776654,\n", " 8.08713238, -0.67001681],\n", " [ 8.73345109, -6.93488115, -8.3810281 , -4.26818106,\n", " 1.81134557, -7.52743298, -1.77582304, 5.16838293,\n", " 8.10107565, -0.79381975],\n", " [ 1.74819017, 8.16118947, -9.90576832, 4.79738654,\n", " -5.42263869, 2.10918741, -3.37941128, -5.7525168 ,\n", " -1.43751066, -7.72815703],\n", " [ 0.47887354, 6.64722637, -8.87581419, 4.37706338,\n", " -4.50057619, 3.91125004, -3.99785584, -3.64640811,\n", " -4.09099286, -7.20453202],\n", " [ 1.40299372, 6.56936992, -10.367435 , 4.18490457,\n", " -4.74828276, 3.3039967 , -0.88339102, -4.45801403,\n", " -3.0234176 , -6.85572322],\n", " [ 8.0814534 , -6.66472986, -8.86963651, -3.14933092,\n", " 2.70031944, -7.70671741, -3.15655246, 4.47687668,\n", " 6.84586227, 
0.57284156],\n", " [ -7.61731007, 9.96940438, 5.9050751 , 7.36461477,\n", " -6.28026981, -0.91248466, -9.66006766, 1.26285408,\n", " -6.87422557, -2.86640255],\n", " [ -4.75404592, 9.14612833, 7.33969268, 8.12378285,\n", " -3.64090915, -2.44587576, -9.14293016, -2.01206008,\n", " -6.4390322 , -3.72867452],\n", " [ 3.07575894, 9.10126862, -9.69402913, 3.27930982,\n", " -3.71321299, 3.58709529, -2.02875134, -4.81074146,\n", " -1.54203008, -6.89553021],\n", " [ -5.04115967, 8.4308978 , 6.70919358, 5.61337293,\n", " -6.15346565, -0.09517894, -9.35268127, 1.44524569,\n", " -7.0141909 , -3.93320794],\n", " [ 3.3710506 , 8.06548499, -11.19828446, 3.55950518,\n", " -4.27920766, 1.68146582, -1.58883053, -5.04734261,\n", " -3.58881484, -7.43129031],\n", " [ -7.37884053, 10.31110429, 6.45771056, 5.29923256,\n", " -4.91350913, -0.08245097, -7.19125988, -0.49320338,\n", " -5.89893184, -2.52807592],\n", " [ 2.78377232, 9.14721839, -9.31752868, 5.02971054,\n", " -5.40625369, 2.69034883, -2.60050592, -4.87641085,\n", " -3.73922139, -7.75700781],\n", " [ -7.18815461, 11.04003455, 6.75103198, 6.11326414,\n", " -5.4970891 , -1.46780011, -8.50699051, -1.55312774,\n", " -6.35448352, -3.48780275],\n", " [ -7.33119051, 8.62272408, 6.94441052, 6.40586215,\n", " -5.76835605, -1.34003481, -9.0301014 , 1.75025808,\n", " -2.69871209, -3.2548357 ],\n", " [ 8.16081328, -5.99862145, -7.91721126, -2.370494 ,\n", " 2.90497899, -7.45672067, -2.54101324, 4.59848598,\n", " 9.22964778, 0.49882278],\n", " [ 9.7696569 , -6.00776103, -9.69767255, -4.89342145,\n", " 3.7240956 , -8.98360574, -2.63503104, 4.3804196 ,\n", " 9.05096306, 0.62325874],\n", " [ -6.62373567, 9.51390792, 6.57593791, 5.51908343,\n", " -7.41938706, -0.80685954, -9.25260921, 1.6334159 ,\n", " -4.96724754, -5.75310421],\n", " [ -6.54947508, 8.21075496, 5.98142599, 6.8623284 ,\n", " -5.82860947, -0.1964107 , -9.58363144, -0.2691615 ,\n", " -7.03283868, -4.82213621],\n", " [ -8.22757979, 10.21218935, 6.67930734, 4.72874379,\n", " 
-5.77306863, -1.59864838, -10.23818184, 1.28505418,\n", " -6.05893205, -4.45681849],\n", " [ 1.55990954, 6.87104959, -8.85260758, 3.66404014,\n", " -5.76899128, 4.55590811, -2.63324431, -5.25340264,\n", " -4.89000981, -7.43409608],\n", " [ -6.27440486, 10.30166091, 6.71238306, 4.91097493,\n", " -4.64580298, -0.11511814, -8.93467996, 0.0448647 ,\n", " -6.25303401, -3.35757954],\n", " [ 9.43000819, -5.78981912, -8.59817638, -4.22743633,\n", " 2.51454518, -6.3104251 , -2.75120535, 6.02089827,\n", " 7.48996685, -0.05361179],\n", " [ 9.07995884, -4.99595319, -8.03834667, -1.41131039,\n", " 4.00753872, -8.23834899, -1.51009624, 5.77921999,\n", " 8.71761315, -0.37391035],\n", " [ -5.93444463, 10.03973678, 7.95319124, 5.4015418 ,\n", " -6.21740785, -2.68832729, -8.88069948, 1.03657615,\n", " -4.29088438, -3.42005618],\n", " [ 9.13175683, -6.73911303, -10.7367822 , -3.60846508,\n", " 0.74907068, -8.1588166 , -1.07751216, 4.60536363,\n", " 7.95054484, 0.31876051],\n", " [ 10.86253562, -7.07446281, -9.29746313, -1.89012594,\n", " 3.36756076, -6.28227592, -2.98846772, 5.83817515,\n", " 8.12268383, 1.75382139],\n", " [ 1.84115507, 7.51679352, -9.87726477, 3.61013118,\n", " -6.99482338, 3.72644289, -1.93215923, -5.76219257,\n", " -2.46753252, -7.01066686],\n", " [ 9.80897671, -5.62218042, -9.90041716, -3.11606894,\n", " 3.73248709, -8.26786291, -3.65023563, 3.25941227,\n", " 8.47262128, 0.77333532],\n", " [ 9.45596022, -5.51744562, -8.37475918, -3.61682585,\n", " 2.98605043, -7.24207429, -2.33866245, 4.30073296,\n", " 7.5418236 , 0.49227357],\n", " [ -5.67049016, 9.30610834, 7.29036502, 4.20517583,\n", " -4.00258202, -2.59114575, -8.93379167, 2.21217295,\n", " -4.96055031, -2.90632493],\n", " [ 2.04923699, 6.37214213, -10.35282331, 4.99626088,\n", " -4.67049424, 2.88594498, -2.13902452, -6.36901439,\n", " -2.92377702, -8.21687154],\n", " [ -6.92688832, 10.60191331, 7.71190434, 5.34421828,\n", " -6.88321786, -2.45686221, -8.90129962, -0.19048209,\n", " -4.5596019 , 
-2.72908716],\n", " [ 11.642012 , -6.36696716, -9.05398188, -1.89334533,\n", " 3.89363839, -8.0538518 , -1.88951523, 5.27760215,\n", " 10.41180319, 0.33349076],\n", " [ 10.61867337, -6.31147104, -9.37431724, -4.1544443 ,\n", " 4.71296835, -9.06295715, -3.16407732, 3.52281301,\n", " 8.29815677, 0.29910613],\n", " [ -6.15304253, 10.63998356, 6.57188038, 4.71437688,\n", " -4.43994365, -1.60521899, -8.92195329, 0.94226268,\n", " -6.8864513 , -3.10416068],\n", " [ 1.40665876, 6.52810918, -9.34006295, 6.16948003,\n", " -5.51641072, 2.94435196, -4.60719183, -6.63053305,\n", " -3.66827433, -9.50428201],\n", " [ -5.65581213, 10.99509461, 6.1924525 , 6.96357882,\n", " -6.28250559, -3.64230853, -9.91909133, -0.17749956,\n", " -5.95538631, -2.23082661],\n", " [ 11.89476959, -7.15557706, -8.06797013, -4.61913362,\n", " 4.31575359, -9.47466501, -3.73573576, 6.45767257,\n", " 5.02865867, 1.45090498],\n", " [ -6.70221205, 11.00273338, 7.36390867, 4.34634178,\n", " -4.17352987, -1.03392243, -10.15707244, 1.01988597,\n", " -4.67050258, -3.72444575],\n", " [ 2.1841777 , 7.36770283, -7.51018241, 2.93246674,\n", " -5.06796018, 3.46893265, -2.23816336, -6.25461272,\n", " -4.11334148, -9.27431208],\n", " [ 10.51372464, -3.61674727, -9.41021241, -4.49457504,\n", " 2.69442413, -6.87929874, -2.72243409, 5.09139952,\n", " 6.00175599, -1.57642207],\n", " [ 3.01069063, 7.68179071, -9.32920751, 6.03425515,\n", " -3.45675673, 2.00146116, -4.42769193, -3.9919336 ,\n", " -2.96204281, -6.98827782],\n", " [ -6.2268659 , 8.70898193, 6.96348485, 7.02194576,\n", " -7.19771432, -1.088074 , -8.97118168, -0.55686889,\n", " -6.93448738, -2.499479 ],\n", " [ 10.90776759, -5.61894403, -8.34289389, -4.45180947,\n", " 0.75041304, -8.93510895, -4.1403297 , 3.99746688,\n", " 8.80545046, 0.53177505],\n", " [ -8.30839001, 11.15571344, 8.0566503 , 6.4540809 ,\n", " -4.52361655, -2.31788753, -9.18441555, 1.13443817,\n", " -6.327716 , -3.69979882],\n", " [ 0.33755585, 7.1828311 , -9.81713313, 4.53773194,\n", " 
-4.06636173, 2.35487865, -1.7849053 , -4.66571464,\n", " -4.68592684, -7.0252872 ],\n", " [ 1.94752325, 7.99642884, -9.05931863, 5.60989924,\n", " -6.28684763, 3.42344236, -0.68638723, -4.29758352,\n", " -6.03048227, -8.26642599],\n", " [ -6.51502203, 10.90112722, 6.77960302, 5.90148205,\n", " -5.24327018, -1.44486398, -8.97202689, 0.15937519,\n", " -4.86480238, -4.77813938],\n", " [ 11.47402784, -5.3108068 , -11.60375735, -4.360209 ,\n", " 3.42167976, -7.51275566, -1.7793791 , 2.98280429,\n", " 8.37743561, 1.45310145],\n", " [ 10.37585095, -6.93827655, -8.06996186, -4.92117081,\n", " 3.72103749, -8.35684158, -3.28171246, 2.71213958,\n", " 10.43529303, -1.24091602],\n", " [ -7.62659963, 9.80133439, 7.02025482, 4.75826417,\n", " -6.60234581, -2.17998355, -10.00292421, 2.12422897,\n", " -5.11672306, -4.00799509],\n", " [ 4.16543798, 8.8057768 , -8.90356876, 4.07857587,\n", " -5.51318314, 2.7500266 , -3.47568451, -4.3522069 ,\n", " -4.90747476, -7.7011194 ],\n", " [ 9.18360496, -5.97141365, -10.21315598, -4.52291312,\n", " 4.22831291, -8.60423127, -1.20598987, 4.38704225,\n", " 8.8186819 , -1.39778001],\n", " [ 1.26686137, 8.72822739, -10.25210642, 5.02877833,\n", " -5.27079876, 3.11717473, -4.14947642, -4.28907811,\n", " -2.71465733, -6.67342221],\n", " [ 3.58782049, 8.36127508, -8.51593328, 4.49701951,\n", " -6.0794952 , 3.65457204, -1.89841585, -6.73140905,\n", " -1.84505963, -7.79058691],\n", " [ 11.01239971, -6.65451023, -7.64801521, -4.17331405,\n", " 3.78355219, -9.50560594, -2.46694677, 6.85600629,\n", " 8.2841873 , 1.44118554],\n", " [ -6.25578692, 9.83871722, 6.29388228, 6.98249754,\n", " -5.69239928, -1.30015791, -8.77081867, 0.12485323,\n", " -3.99681507, -1.67624355],\n", " [ -6.35669119, 11.63309114, 6.8628025 , 5.18280529,\n", " -3.78635494, -2.48668124, -10.58398322, -0.80536908,\n", " -4.74855514, -2.36996 ],\n", " [ 1.7136007 , 5.39157553, -9.11639074, 5.28449095,\n", " -5.71978471, 3.48523713, -3.82048051, -5.51674723,\n", " -3.96021584, -9.3934277 
],\n", " [ -5.82659009, 11.15087869, 7.15803379, 6.06686272,\n", " -4.76377128, -2.61302825, -9.66256542, 2.22069732,\n", " -6.68582621, -3.58539799],\n", " [ 1.14728092, 8.5256735 , -8.15893839, 4.42519706,\n", " -5.18929784, 2.94305348, -3.40024378, -5.27960921,\n", " -3.97083355, -7.14775022],\n", " [ -5.84402192, 8.15236221, 6.84728669, 7.07082746,\n", " -2.87556143, -2.60096743, -8.91920786, 1.5107383 ,\n", " -5.36518255, -4.37456514],\n", " [ 10.99941425, -6.27742495, -9.1404513 , -2.62402282,\n", " 2.08965849, -7.7161278 , -2.6293763 , 4.85982046,\n", " 5.90488252, 0.93305878],\n", " [ 10.08352155, -5.33783486, -7.80831943, -3.41598225,\n", " 1.5887739 , -6.51898542, -2.6566093 , 4.11159175,\n", " 7.9608816 , 2.0861752 ],\n", " [ 0.78928272, 6.38627183, -9.13736714, 2.17240011,\n", " -6.1378357 , 3.56063468, -3.62020213, -5.09935168,\n", " -2.97072185, -7.77896021]])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Display y." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 0, 1, 2, 2, 2, 1, 2, 0, 2, 1, 0, 1, 2, 1, 0, 1, 0, 2, 0, 1, 1,\n", " 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 1, 2, 2, 0, 2,\n", " 0, 2, 0, 2, 2, 1, 1, 2, 2, 2, 0, 2, 1, 1, 2, 1, 1, 0, 1, 1, 2, 0,\n", " 2, 1, 1, 2, 0, 2, 1, 2, 0, 1, 0, 2, 1, 2, 0, 0, 2, 1, 1, 2, 0, 1,\n", " 0, 0, 1, 2, 2, 0, 2, 0, 2, 1, 1, 0])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Visualize the clusters in 2 dimensions using PCA. First create a PCA object and find the new X coordinates in 2 dimensions."
] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Fit a 2-component PCA on X, then project X onto those components for plotting.\n", "pca = PCA(n_components = 2).fit(X)\n", "new_X = pca.transform(X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a new dataframe containing the new X coordinates and a column with the cluster number." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PC1PC2cluster
0-16.322363-6.3278122
1-4.58238512.2718280
220.070119-2.2124481
3-15.318259-9.0103052
4-14.074522-8.9181842
5-14.461845-6.3053452
620.459791-4.6107451
7-14.910912-6.6551472
8-2.97447412.3107340
9-14.894593-9.0096172
1021.211358-3.3469071
11-6.7585649.5064860
1219.930349-2.3263771
13-16.034294-9.3097812
1420.519449-3.9088101
15-3.94825512.1091260
1619.759737-2.1991201
17-3.79628912.3870080
18-14.127102-9.1449382
19-4.04062512.0951430
2017.956707-4.6443231
2118.639127-2.8696921
22-5.90251410.8877210
23-5.1845739.9820470
2420.075349-2.8774451
25-4.3569199.4892300
26-2.9733249.5789130
2717.997692-4.0131111
2821.304722-5.0414561
29-5.75240611.4483070
............
70-5.73158611.4341020
71-15.283114-7.7869952
7220.828041-5.5544371
73-15.022794-8.5418942
74-5.28550710.8915680
7517.887968-1.5912551
76-3.6592209.3898040
77-15.943505-7.0869742
7819.648554-3.9703991
79-16.975697-9.1208602
80-4.66828110.4141580
81-6.25910411.8569120
82-15.496020-6.9811832
8321.456248-0.7030641
8421.031956-3.5536551
85-15.238078-8.9959412
86-4.81491910.7915820
8720.818377-2.2225491
88-4.87178310.4162240
89-4.36871511.8211730
9021.399042-6.0571731
91-14.306515-7.6228442
92-14.983878-8.3662042
93-4.87020011.1019290
94-15.452853-8.6950382
95-6.0211979.7977840
96-13.303403-8.5335462
9719.071838-2.8235961
9818.448396-3.6464381
99-4.23301510.1571800
\n", "

100 rows × 3 columns

\n", "
" ], "text/plain": [ " PC1 PC2 cluster\n", "0 -16.322363 -6.327812 2\n", "1 -4.582385 12.271828 0\n", "2 20.070119 -2.212448 1\n", "3 -15.318259 -9.010305 2\n", "4 -14.074522 -8.918184 2\n", "5 -14.461845 -6.305345 2\n", "6 20.459791 -4.610745 1\n", "7 -14.910912 -6.655147 2\n", "8 -2.974474 12.310734 0\n", "9 -14.894593 -9.009617 2\n", "10 21.211358 -3.346907 1\n", "11 -6.758564 9.506486 0\n", "12 19.930349 -2.326377 1\n", "13 -16.034294 -9.309781 2\n", "14 20.519449 -3.908810 1\n", "15 -3.948255 12.109126 0\n", "16 19.759737 -2.199120 1\n", "17 -3.796289 12.387008 0\n", "18 -14.127102 -9.144938 2\n", "19 -4.040625 12.095143 0\n", "20 17.956707 -4.644323 1\n", "21 18.639127 -2.869692 1\n", "22 -5.902514 10.887721 0\n", "23 -5.184573 9.982047 0\n", "24 20.075349 -2.877445 1\n", "25 -4.356919 9.489230 0\n", "26 -2.973324 9.578913 0\n", "27 17.997692 -4.013111 1\n", "28 21.304722 -5.041456 1\n", "29 -5.752406 11.448307 0\n", ".. ... ... ...\n", "70 -5.731586 11.434102 0\n", "71 -15.283114 -7.786995 2\n", "72 20.828041 -5.554437 1\n", "73 -15.022794 -8.541894 2\n", "74 -5.285507 10.891568 0\n", "75 17.887968 -1.591255 1\n", "76 -3.659220 9.389804 0\n", "77 -15.943505 -7.086974 2\n", "78 19.648554 -3.970399 1\n", "79 -16.975697 -9.120860 2\n", "80 -4.668281 10.414158 0\n", "81 -6.259104 11.856912 0\n", "82 -15.496020 -6.981183 2\n", "83 21.456248 -0.703064 1\n", "84 21.031956 -3.553655 1\n", "85 -15.238078 -8.995941 2\n", "86 -4.814919 10.791582 0\n", "87 20.818377 -2.222549 1\n", "88 -4.871783 10.416224 0\n", "89 -4.368715 11.821173 0\n", "90 21.399042 -6.057173 1\n", "91 -14.306515 -7.622844 2\n", "92 -14.983878 -8.366204 2\n", "93 -4.870200 11.101929 0\n", "94 -15.452853 -8.695038 2\n", "95 -6.021197 9.797784 0\n", "96 -13.303403 -8.533546 2\n", "97 19.071838 -2.823596 1\n", "98 18.448396 -3.646438 1\n", "99 -4.233015 10.157180 0\n", "\n", "[100 rows x 3 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_X_df 
= pd.DataFrame(new_X, columns = [\"PC1\", \"PC2\"])\n", "new_X_df[\"cluster\"] = y\n", "new_X_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use a scatter plot to visualize the clusters in 2 dimensions." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One point per sample in PCA space, coloured by its true cluster.\n", "sns.relplot(x = \"PC1\", y = \"PC2\", hue = \"cluster\", data = new_X_df,\n", "            palette = [\"blue\",\"orange\",\"green\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run k-means clustering to predict the clusters." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 0, 0, 0, 2, 1, 1, 2, 0, 1, 2, 1, 0, 1, 0, 0, 1, 2, 0, 1, 1,\n", " 2, 2, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,\n", " 1, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 0, 1, 0,\n", " 0, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 0, 1, 0, 2, 1, 0, 0, 2, 1, 1,\n", " 2, 1, 0, 0, 0, 0, 0, 1, 2, 1, 1, 0], dtype=int32)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# n_init and random_state are pinned so the labels are reproducible across runs\n", "# and do not depend on the scikit-learn version's default number of restarts.\n", "kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = 42)\n", "kmeans_clusters = kmeans.fit_predict(X)\n", "kmeans_clusters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Store the predicted cluster in the dataframe you created." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "new_X_df[\"predicted\"] = kmeans_clusters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the confusion matrix between the actual and predicted values. How accurate was k-means?" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[34, 0, 0],\n", " [ 0, 0, 33],\n", " [ 0, 33, 0]])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows are true clusters, columns are k-means labels. The k-means label ids are\n", "# arbitrary, so a permuted diagonal still means perfect agreement.\n", "confusion_matrix(new_X_df[\"cluster\"], new_X_df[\"predicted\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens to the above analysis as you increase the number of features?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens to the above analysis if you use 3 features, but increase the number of clusters?"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens if you increase both the number of features and the number of clusters?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following function also simulates data. What kind of data is it? Hint: try looking at the data and plotting it." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Fixed seed so that comparisons across different noise levels use repeatable data.\n", "X, y = make_moons(noise = 0.05, random_state = 42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run k-means clustering to predict clusters." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How accurate is k-means clustering on this dataset?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens to the accuracy if you increase the noise parameter?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How does hierarchical clustering perform on the above data sets? " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There is also a `make_circles()` function. What does it do?
The documentation is [here](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_circles.html#sklearn.datasets.make_circles)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.8" } }, "nbformat": 4, "nbformat_minor": 2 }